# Extract the legend from a ggplot object.
#
# Args:
#   a.gplot: a ggplot object.
# Returns:
#   The grob named "guide-box" taken from the gtable built for `a.gplot`.
# Raises:
#   An error when the built gtable holds no grob named "guide-box", for
#   instance when the plot draws no legend.
extractLegend <- function(a.gplot) {
  tmp <- ggplot_gtable(ggplot_build(a.gplot))
  # vapply() always yields a logical vector, whatever the grobs look like.
  is_legend <- vapply(tmp$grobs,
                      function(grob) identical(grob$name, "guide-box"),
                      logical(1))
  if (!any(is_legend)) {
    stop("extractLegend: the plot has no legend (no 'guide-box' grob found)",
         call. = FALSE)
  }
  # Use the first match so that `[[` always receives exactly one index.
  tmp$grobs[[which(is_legend)[1]]]
}

## LDA topics on forum sample (50,000 posts)
# The RData file is expected to provide `lda` (the fitted topic model) and
# `forum_text_lda` (one row per forum post); neither is created in this script.
load("02_20_m5s_forum_lda.RData")
require(topicmodels)

# Topic id assigned by the model to every post.
forum_text_lda$lda_topic_int <- topics(lda)

# topicmodels::terms(lda, 30)
# table(topics(lda))

# Label for each of the 15 topic ids. Ids 3/4 share 'law-making' and ids 6/14
# share 'undefined'.
topic_labels <- list(
  "1"  = "environment",
  "2"  = "M5S - online participation",
  "3"  = "law-making",
  "4"  = "law-making",
  "5"  = "M5S - election and representation",
  "6"  = "undefined",
  "7"  = "local institutions",
  "8"  = "M5S - national politics",
  "9"  = "M5S - needs and change",
  "10" = "rights - personal, family, cultural",
  "11" = "macroeconomy",
  "12" = "liberalisation - drugs and prostitution",
  "13" = "employment and education",
  "14" = "undefined",
  "15" = "microeconomy"
)

# Top 20 terms per topic, one column per topic, headed by the topic label.
lda_topics_tbl <- as.data.frame(topicmodels::terms(lda, 20))
colnames(lda_topics_tbl) <- topic_labels

# Display order of the distinct labels (ids 4 and 14 are left out because their
# labels already appear through ids 3 and 6).
topic_labels_ordered <-
  unlist(topic_labels, use.names = FALSE)[c(2, 5, 8, 9, 11, 15, 1, 13, 3, 10, 7, 12, 6)]

# Translate each post's topic id into its label.
forum_text_lda$lda_topic_label <- as.character(
  topic_labels[as.character(forum_text_lda$lda_topic_int)]
)

# Number of posts per label; levels are stored in reversed display order.
topic_freq_df <- as.data.frame(table(forum_text_lda$lda_topic_label))
topic_freq_df$Var1 <- factor(topic_freq_df$Var1, levels = rev(topic_labels_ordered))

## Histogram
# This is the first ggplot() call in the script, while the script's own
# require(ggplot2) only appears much further down; attach ggplot2 here so the
# script also runs in a fresh session.
library(ggplot2)

# Horizontal bar chart of the number of posts per topic label.
lda_forum_hist <- ggplot(topic_freq_df, aes(Var1, Freq)) +
  geom_bar(stat = 'identity') +
  coord_flip() +
  labs(x = NULL, y = "Number of postings (n = 50,000)")

save(lda_forum_hist, file = '02_22_lda_m5s_forum_hist.RData')

# Time series

# Fix the session time zone before converting timestamps to dates
# (presumably so the resulting dates do not depend on the machine's zone).
Sys.setenv(TZ='GMT') # important

# `date` is converted with the Unix epoch as origin; week and month are the
# calendar bins each post falls into.
forum_text_lda$date_class <- as.Date(as.POSIXct(forum_text_lda$date, origin = "1970-01-01"))
forum_text_lda$week <- cut(forum_text_lda$date_class, breaks = "weeks")
forum_text_lda$month <- cut(forum_text_lda$date_class, breaks = "months")

# library() rather than require(): the code below cannot run without these
# packages, so fail immediately if one is missing. ggplot2 is attached here
# because ggplot() is called in this section, before the script's later
# require(ggplot2).
library(dplyr)
library(ggplot2)

# Posts per week and topic label, leaving out the 'undefined' topics.
weekly_lda_aggregate <-
  subset(forum_text_lda, lda_topic_label != 'undefined') %>%
  dplyr::group_by(week, lda_topic_label) %>%
  dplyr::summarize(label_count = n())

# Attach the weekly total (again without 'undefined') to every row.
weekly_lda_aggregate <-
  merge(weekly_lda_aggregate,
        {subset(forum_text_lda, lda_topic_label != 'undefined') %>%
            dplyr::group_by(week) %>%
            dplyr::summarize(weekly_count = n())},
        by = 'week')

# Share of the week's posts that belong to the label.
weekly_lda_aggregate$perc <- with(weekly_lda_aggregate, label_count / weekly_count)

# Weekly share of the two topics of interest: raw lines plus a smoother.
lda_forum_ts_plot <-
  ggplot(subset(weekly_lda_aggregate,
                lda_topic_label %in% c('environment', 'M5S - national politics')),
         aes(x = as.Date(week), y = perc, group = lda_topic_label, colour = lda_topic_label)) +
  geom_line(alpha = 0.3) +
  geom_smooth(se = FALSE) + coord_cartesian(ylim=c(0, .5)) + scale_y_continuous(labels = scales::percent) +
  labs(x = NULL, y = "Over total number of postings", colour="") +
  theme(legend.position="bottom")

## LDA from BLOG
# The RData file is expected to provide `blog_post` and a blog-fitted `lda`;
# from here on `lda` no longer refers to the forum model loaded earlier.
load("02_01_m5s_blog_post_lda.RData")

blog_post$lda_topic <- topics(lda)
# terms(lda, 20)
# 12 = 'environment', 4 = 'M5S - national politics'

blog_post$date_class <-
  as.Date(blog_post$date)
blog_post$week <- cut(blog_post$date_class, breaks = "weeks")

# Weekly number of blog posts assigned to topic 12 and to topic 4.
# NOTE: the column name 'environemnt' (sic) is referenced by later code in this
# script, so the spelling is kept as is.
blog_post_weekly <-
  blog_post %>%
  dplyr::group_by(week) %>%
  dplyr::summarize(environemnt = sum(lda_topic == 12),
                   national_pol = sum(lda_topic == 4))
library(reshape2)
# Long format: one row per week and topic column.
blog_post_weekly <- reshape2::melt(blog_post_weekly, id.vars = "week")

# Attach the total number of blog posts of each week. The totals are passed
# straight to merge(); assigning them to `blog_post_weekly` inside the call (as
# was done before) only worked because merge() happens to evaluate its first
# argument before its second.
blog_post_weekly <- merge(blog_post_weekly,
                          blog_post %>%
                            dplyr::group_by(week) %>%
                            dplyr::summarize(tot_weekly_posts = n()),
                          by = "week")

# Merge forum and blog LDA

# Blog side: weekly share of each of the two topics.
sbst_blog_lda <- subset(blog_post_weekly, variable %in% c("environemnt","national_pol"))
# Compute the share from the rows of the subset itself, so the values stay
# aligned with their rows even if the subset ever drops rows.
sbst_blog_lda$perc <- with(sbst_blog_lda, value / tot_weekly_posts)
sbst_blog_lda$value <- NULL
sbst_blog_lda$tot_weekly_posts <- NULL
sbst_blog_lda$source <- "blog"

# Forum side: same two topics, reduced to the same columns as the blog table.
sbst_forum_lda <-
  subset(weekly_lda_aggregate, lda_topic_label %in% c('environment', 'M5S - national politics'))
sbst_forum_lda$label_count <- NULL
sbst_forum_lda$weekly_count <- NULL
sbst_forum_lda$source <- "forum"
# Rename by name rather than by position, so a change in column order cannot
# silently rename the wrong column.
colnames(sbst_forum_lda)[colnames(sbst_forum_lda) == "lda_topic_label"] <- "variable"

sbst_blog_forum_lda <- rbind(sbst_blog_lda, sbst_forum_lda)
# plyr is still attached (as before) in case later code relies on it, but
# revalue() is called through its namespace to make the origin explicit.
library(plyr)
# Give blog and forum rows the same topic names.
sbst_blog_forum_lda$variable <-
  plyr::revalue(sbst_blog_forum_lda$variable,
                c("environemnt"="environment", "national_pol"="national politics",
                  "M5S - national politics"="national politics"))

sbst_blog_forum_lda$week <- as.Date(sbst_blog_forum_lda$week)

require(grid)
require(gridExtra)
require(scales)
require(ggplot2)

# Smoothed weekly topic shares on the blog. The x limits span the weeks of both
# sources; x-axis line, text and ticks are hidden and the bottom margin is zero
# (presumably because this panel is meant to sit above the forum panel).
lda_blog_ts <-
  ggplot(
    subset(sbst_blog_forum_lda, source == "blog"),
    aes(x = week, y = perc, group = variable, colour = variable)
  ) +
  # geom_line() +
  stat_smooth(se = FALSE) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = NULL, y = "Blog", colour = "") +
  scale_x_date(limits = range(sbst_blog_forum_lda$week)) +
  theme(
    legend.position = "bottom",
    legend.key = element_blank(),
    axis.line.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.margin = unit(c(.5, .5, 0, .5), "cm")
  )

# Smoothed weekly topic shares on the forum, on the same x limits, with a zero
# top margin and an untitled legend at the bottom.
lda_forum_ts <-
  ggplot(
    subset(sbst_blog_forum_lda, source == "forum"),
    aes(x = week, y = perc, group = variable, colour = variable)
  ) +
  # geom_line() +
  stat_smooth(se = FALSE) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = NULL, y = "Forum") +
  scale_x_date(limits = range(sbst_blog_forum_lda$week)) +
  theme(
    plot.margin = unit(c(0, .5, .5, .5), "cm"),
    legend.position = "bottom",
    legend.title = element_blank()
  )

# Legend of the forum plot as a stand-alone grob.
legend_lda_forum_ts <- extractLegend(lda_forum_ts)

# Store both plots and the legend for later use.
save(
  lda_blog_ts, lda_forum_ts, legend_lda_forum_ts,
  file = "02_22_lda_blog_and_forum_ts.RData"
)

## WHEN smooths cross

# Fitted y values of the default stat_smooth() curve for one source/topic pair.
#
# Args:
#   src:   value of `source` to keep ("forum" or "blog").
#   topic: value of `variable` to keep.
# Returns:
#   Numeric vector of fitted values, read from the built plot data. With the
#   default stat_smooth() settings this holds 80 points, matching the 80-point
#   date sequences below.
#
# The values used to be captured by assigning `..y..` to a global from inside
# aes(), which only ran when the plot was actually rendered. Building the plot
# explicitly does not depend on rendering; as a consequence the four helper
# plots are no longer drawn.
smooth_fitted_values <- function(src, topic) {
  keep <- which(sbst_blog_forum_lda$source == src &
                  sbst_blog_forum_lda$variable == topic)
  smooth_plot <-
    ggplot(sbst_blog_forum_lda[keep, ], aes(x = week, y = perc)) +
    stat_smooth()
  ggplot_build(smooth_plot)$data[[1]]$y
}

forum_environment <- smooth_fitted_values("forum", "environment")
forum_politics <- smooth_fitted_values("forum", "national politics")

blog_environment <- smooth_fitted_values("blog", "environment")
blog_politics <- smooth_fitted_values("blog", "national politics")

# 80 equally spaced dates over the weeks covered by each source, used to map
# positions in the fitted-value vectors back to dates.
seq_date_forum_80 <-
  seq(min(subset(sbst_blog_forum_lda, source == "forum")$week),
      max(subset(sbst_blog_forum_lda, source == "forum")$week), length.out = 80)
seq_date_blog_80 <-
  seq(min(subset(sbst_blog_forum_lda, source == "blog")$week),
      max(subset(sbst_blog_forum_lda, source == "blog")$week), length.out = 80)

# diff_forum <- forum_environment - forum_politics
# seq_date_forum_80[which.min(abs(diff_forum))] 
# "2012-07-03"

# diff_blog <- blog_environment - blog_politics
# seq_date_blog_80[which.min(abs(diff_blog))]
# "2010-08-30"
# round(1-(blog_environment[2:80]/blog_environment[1:79]),2)
# seq_date_blog_80[51]

## SLOPEGRAPH
# Slope graph
# require() is kept on purpose: nothing in this section calls slopegraph, so a
# missing package should not stop the script here.
require(slopegraph)

# Midpoint between the first and the last week of the forum aggregate.
mid_date <- local({
  week_dates <- as.Date(weekly_lda_aggregate$week)
  min(week_dates) + (max(week_dates) - min(week_dates)) / 2
})

# 1 = post dated up to and including the midpoint, 2 = after it. Vectorized
# comparison instead of one function call per post.
forum_text_lda$period <- ifelse(forum_text_lda$date_class <= mid_date, 1, 2)

library(dplyr)
# Posts per topic label and period.
halves_aggregate <-
  forum_text_lda %>%
  dplyr::group_by(lda_topic_label, period) %>%
  dplyr::summarize(label_count = n())

halves_aggregate <- subset(halves_aggregate, lda_topic_label != 'undefined')
# Attach each period's total (computed after dropping 'undefined').
halves_aggregate <-
  merge(halves_aggregate,
        {halves_aggregate %>%
            dplyr::group_by(period) %>%
            dplyr::summarize(total_period = sum(label_count))},
        by = c("period"))
# Share of the period's posts, in percent with one decimal.
halves_aggregate$perc <-
  round((halves_aggregate$label_count / halves_aggregate$total_period) * 100,1)
# Wide format: one row per label, one column per period; labels become row
# names and the label column is dropped.
halves_aggregate <- reshape2::dcast(halves_aggregate, lda_topic_label ~ period, value.var = "perc")
row.names(halves_aggregate) <- halves_aggregate$lda_topic_label
halves_aggregate$lda_topic_label <- NULL

save(halves_aggregate, file ="02_22_halves_aggregate.RData")

## USER analysis (first last months)

# Posts with a defined topic dated less than 180 days after the earliest post.
forum_text_lda_first6m <- subset(
  forum_text_lda,
  date_class < min(date_class) + 6 * 30 & lda_topic_label != "undefined"
)

# Posts with a defined topic dated less than 180 days before the latest post.
forum_text_lda_last6m <- subset(
  forum_text_lda,
  date_class > max(date_class) - 6 * 30 & lda_topic_label != "undefined"
)

# Sort posts by date and keep only the first row of every author.
earliest_post_per_author <- function(posts) {
  chronological <- posts[order(posts$date_class), ]
  chronological[!duplicated(chronological$author_unique_id), ]
}

forum_text_lda_first6m <- earliest_post_per_author(forum_text_lda_first6m)
forum_text_lda_last6m <- earliest_post_per_author(forum_text_lda_last6m)

# Share of authors whose first post in the window is about national politics.
with(forum_text_lda_first6m,
     sum(lda_topic_label == "M5S - national politics") / length(lda_topic_label))
with(forum_text_lda_last6m,
     sum(lda_topic_label == "M5S - national politics") / length(lda_topic_label))

# Authors with at least one post before 2011-07-01 and at least one post on or
# after 2012-07-01.
# NOTE(review): this section was previously described as "active before
# 31 Dec 2011 and also active between 2012-2013", which does not match the
# cut-off dates used here - confirm which one is intended.
longterm_users <- with(
  forum_text_lda,
  intersect(
    unique(author_unique_id[which(date_class < as.Date("2011-07-01"))]),
    unique(author_unique_id[which(date_class >= as.Date("2012-07-01"))])
  )
)

require(dplyr)
# All posts with a defined topic written by those authors.
forum_text_lda_lt <- subset(
  forum_text_lda,
  lda_topic_label != "undefined" & author_unique_id %in% longterm_users
)

# prop.table(table(subset(forum_text_lda_lt, period==1)$lda_topic_label))[c('environment','M5S - national politics')]
# prop.table(table(subset(forum_text_lda_lt, period==2)$lda_topic_label))[c('environment','M5S - national politics')]



